/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * CheckClusterer.java
 * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.clusterers;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.core.CheckScheme;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.MultiInstanceCapabilitiesHandler;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.SerializationHelper;
import weka.core.TestInstances;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;

/**
 * Class for examining the capabilities and finding problems with clusterers. If
 * you implement a clusterer using the WEKA.libraries, you should run the checks
 * on it to ensure robustness and correct operation. Passing all the tests of
 * this object does not mean bugs in the clusterer don't exist, but this will
 * help find some common ones.
 * <p/>
 * 
 * Typical usage:
 * <p/>
 * <code>java weka.clusterers.CheckClusterer -W clusterer_name 
 * -- clusterer_options </code>
 * <p/>
 * 
 * CheckClusterer reports on the following:
 * <ul>
 * <li>Clusterer abilities
 * <ul>
 * <li>Possible command line options to the clusterer</li>
 * <li>Whether the clusterer can predict nominal, numeric, string, date or
 * relational class attributes.</li>
 * <li>Whether the clusterer can handle numeric predictor attributes</li>
 * <li>Whether the clusterer can handle nominal predictor attributes</li>
 * <li>Whether the clusterer can handle string predictor attributes</li>
 * <li>Whether the clusterer can handle date predictor attributes</li>
 * <li>Whether the clusterer can handle relational predictor attributes</li>
 * <li>Whether the clusterer can handle multi-instance data</li>
 * <li>Whether the clusterer can handle missing predictor values</li>
 * <li>Whether the clusterer can handle instance weights</li>
 * </ul>
 * </li>
 * <li>Correct functioning
 * <ul>
 * <li>Correct initialisation during buildClusterer (i.e. no result changes when
 * buildClusterer called repeatedly)</li>
 * <li>Whether the clusterer alters the data pased to it (number of instances,
 * instance order, instance weights, etc)</li>
 * </ul>
 * </li>
 * <li>Degenerate cases
 * <ul>
 * <li>building clusterer with zero training instances</li>
 * <li>all but one predictor attribute values missing</li>
 * <li>all predictor attribute values missing</li>
 * <li>all but one class values missing</li>
 * <li>all class values missing</li>
 * </ul>
 * </li>
 * </ul>
 * Running CheckClusterer with the debug option set will output the training
 * dataset for any failed tests.
 * <p/>
 * 
 * The <code>weka.clusterers.AbstractClustererTest</code> uses this class to
 * test all the clusterers. Any changes here, have to be checked in that
 * abstract test class, too.
 * <p/>
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -D
 *  Turn on debugging output.
 * </pre>
 * 
 * <pre>
 * -S
 *  Silent mode - prints nothing to stdout.
 * </pre>
 * 
 * <pre>
 * -N &lt;num&gt;
 *  The number of instances in the datasets (default 20).
 * </pre>
 * 
 * <pre>
 * -nominal &lt;num&gt;
 *  The number of nominal attributes (default 2).
 * </pre>
 * 
 * <pre>
 * -nominal-values &lt;num&gt;
 *  The number of values for nominal attributes (default 1).
 * </pre>
 * 
 * <pre>
 * -numeric &lt;num&gt;
 *  The number of numeric attributes (default 1).
 * </pre>
 * 
 * <pre>
 * -string &lt;num&gt;
 *  The number of string attributes (default 1).
 * </pre>
 * 
 * <pre>
 * -date &lt;num&gt;
 *  The number of date attributes (default 1).
 * </pre>
 * 
 * <pre>
 * -relational &lt;num&gt;
 *  The number of relational attributes (default 1).
 * </pre>
 * 
 * <pre>
 * -num-instances-relational &lt;num&gt;
 *  The number of instances in relational/bag attributes (default 10).
 * </pre>
 * 
 * <pre>
 * -words &lt;comma-separated-list&gt;
 *  The words to use in string attributes.
 * </pre>
 * 
 * <pre>
 * -word-separators &lt;chars&gt;
 *  The word separators to use in string attributes.
 * </pre>
 * 
 * <pre>
 * -W
 *  Full name of the clusterer analyzed.
 *  eg: weka.clusterers.SimpleKMeans
 *  (default weka.clusterers.SimpleKMeans)
 * </pre>
 * 
 * <pre>
 * Options specific to clusterer weka.clusterers.SimpleKMeans:
 * </pre>
 * 
 * <pre>
 * -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).
 * </pre>
 * 
 * <pre>
 * -V
 *  Display std. deviations for centroids.
 * </pre>
 * 
 * <pre>
 * -M
 *  Replace missing values with mean/mode.
 * </pre>
 * 
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * Options after -- are passed to the designated clusterer.
 * <p/>
 * 
 * @author Len Trigg (trigg@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision$
 * @see TestInstances
 */
public class CheckClusterer extends CheckScheme {

    /*
     * Note about test methods: - methods return array of booleans - first index:
     * success or not - second index: acceptable or not (e.g., Exception is OK)
     * 
     * FracPete (fracpete at waikato dot ac dot nz)
     */

    /*** The clusterer to be examined */
    protected Clusterer m_Clusterer = new SimpleKMeans();

    /**
     * default constructor
     */
    public CheckClusterer() {
        super();

        setNumInstances(40);
    }

    /**
     * Returns an enumeration describing the available options.
     * 
     * @return an enumeration of all the available options.
     */
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();

        result.addElement(new Option("\tFull name of the clusterer analyzed.\n" + "\teg: weka.clusterers.SimpleKMeans\n" + "\t(default weka.clusterers.SimpleKMeans)", "W", 1, "-W"));

        result.addAll(Collections.list(super.listOptions()));

        if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler)) {
            result.addElement(new Option("", "", 0, "\nOptions specific to clusterer " + m_Clusterer.getClass().getName() + ":"));

            result.addAll(Collections.list(((OptionHandler) m_Clusterer).listOptions()));
        }

        return result.elements();
    }

    /**
     * Parses a given list of options.
     * <p/>
     * 
     * <!-- options-start --> Valid options are:
     * <p/>
     * 
     * <pre>
     * -D
     *  Turn on debugging output.
     * </pre>
     * 
     * <pre>
     * -S
     *  Silent mode - prints nothing to stdout.
     * </pre>
     * 
     * <pre>
     * -N &lt;num&gt;
     *  The number of instances in the datasets (default 20).
     * </pre>
     * 
     * <pre>
     * -nominal &lt;num&gt;
     *  The number of nominal attributes (default 2).
     * </pre>
     * 
     * <pre>
     * -nominal-values &lt;num&gt;
     *  The number of values for nominal attributes (default 1).
     * </pre>
     * 
     * <pre>
     * -numeric &lt;num&gt;
     *  The number of numeric attributes (default 1).
     * </pre>
     * 
     * <pre>
     * -string &lt;num&gt;
     *  The number of string attributes (default 1).
     * </pre>
     * 
     * <pre>
     * -date &lt;num&gt;
     *  The number of date attributes (default 1).
     * </pre>
     * 
     * <pre>
     * -relational &lt;num&gt;
     *  The number of relational attributes (default 1).
     * </pre>
     * 
     * <pre>
     * -num-instances-relational &lt;num&gt;
     *  The number of instances in relational/bag attributes (default 10).
     * </pre>
     * 
     * <pre>
     * -words &lt;comma-separated-list&gt;
     *  The words to use in string attributes.
     * </pre>
     * 
     * <pre>
     * -word-separators &lt;chars&gt;
     *  The word separators to use in string attributes.
     * </pre>
     * 
     * <pre>
     * -W
     *  Full name of the clusterer analyzed.
     *  eg: weka.clusterers.SimpleKMeans
     *  (default weka.clusterers.SimpleKMeans)
     * </pre>
     * 
     * <pre>
     * Options specific to clusterer weka.clusterers.SimpleKMeans:
     * </pre>
     * 
     * <pre>
     * -N &lt;num&gt;
     *  number of clusters.
     *  (default 2).
     * </pre>
     * 
     * <pre>
     * -V
     *  Display std. deviations for centroids.
     * </pre>
     * 
     * <pre>
     * -M
     *  Replace missing values with mean/mode.
     * </pre>
     * 
     * <pre>
     * -S &lt;num&gt;
     *  Random number seed.
     *  (default 10)
     * </pre>
     * 
     * <!-- options-end -->
     * 
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {
        String tmpStr;

        tmpStr = Utils.getOption('N', options);

        if (tmpStr.length() != 0) {
            setNumInstances(Integer.parseInt(tmpStr));
        } else {
            setNumInstances(40);
        }

        super.setOptions(options);

        tmpStr = Utils.getOption('W', options);
        if (tmpStr.length() == 0) {
            tmpStr = weka.clusterers.SimpleKMeans.class.getName();
        }
        setClusterer((Clusterer) forName("weka.clusterers", Clusterer.class, tmpStr, Utils.partitionOptions(options)));

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Gets the current settings of the CheckClusterer.
     * 
     * @return an array of strings suitable for passing to setOptions
     */
    @Override
    public String[] getOptions() {
        Vector<String> result = new Vector<String>();

        if (getClusterer() != null) {
            result.add("-W");
            result.add(getClusterer().getClass().getName());
        }

        Collections.addAll(result, super.getOptions());

        if ((m_Clusterer != null) && (m_Clusterer instanceof OptionHandler)) {
            String[] options = ((OptionHandler) m_Clusterer).getOptions();

            if (options.length > 0) {
                result.add("--");
                Collections.addAll(result, options);
            }
        }

        return result.toArray(new String[result.size()]);
    }

    /**
     * Begin the tests, reporting results to System.out
     */
    @Override
    public void doTests() {

        if (getClusterer() == null) {
            println("\n=== No clusterer set ===");
            return;
        }
        println("\n=== Check on Clusterer: " + getClusterer().getClass().getName() + " ===\n");

        // Start tests
        println("--> Checking for interfaces");
        canTakeOptions();
        boolean updateable = updateableClusterer()[0];
        boolean weightedInstancesHandler = weightedInstancesHandler()[0];
        boolean multiInstanceHandler = multiInstanceHandler()[0];
        println("--> Clusterer tests");
        declaresSerialVersionUID();
        runTests(weightedInstancesHandler, multiInstanceHandler, updateable);
    }

    /**
     * Set the clusterer for testing.
     * 
     * @param newClusterer the Clusterer to use.
     */
    public void setClusterer(Clusterer newClusterer) {
        m_Clusterer = newClusterer;
    }

    /**
     * Get the clusterer used as the clusterer
     * 
     * @return the clusterer used as the clusterer
     */
    public Clusterer getClusterer() {
        return m_Clusterer;
    }

    /**
     * Run a battery of tests
     * 
     * @param weighted      true if the clusterer says it handles weights
     * @param multiInstance true if the clusterer is a multi-instance clusterer
     * @param updateable    true if the classifier is updateable
     */
    protected void runTests(boolean weighted, boolean multiInstance, boolean updateable) {

        boolean PNom = canPredict(true, false, false, false, false, multiInstance)[0];
        boolean PNum = canPredict(false, true, false, false, false, multiInstance)[0];
        boolean PStr = canPredict(false, false, true, false, false, multiInstance)[0];
        boolean PDat = canPredict(false, false, false, true, false, multiInstance)[0];
        boolean PRel;
        if (!multiInstance) {
            PRel = canPredict(false, false, false, false, true, multiInstance)[0];
        } else {
            PRel = false;
        }

        if (PNom || PNum || PStr || PDat || PRel) {
            if (weighted) {
                instanceWeights(PNom, PNum, PStr, PDat, PRel, multiInstance);
            }

            canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel, multiInstance);
            boolean handleMissingPredictors = canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 20)[0];
            if (handleMissingPredictors) {
                canHandleMissing(PNom, PNum, PStr, PDat, PRel, multiInstance, true, 100);
            }

            correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel, multiInstance);
            datasetIntegrity(PNom, PNum, PStr, PDat, PRel, multiInstance, handleMissingPredictors);
            if (updateable) {
                updatingEquality(PNom, PNum, PStr, PDat, PRel, multiInstance);
            }
        }
    }

    /**
     * Checks whether the scheme can take command line options.
     * 
     * @return index 0 is true if the clusterer can take options
     */
    protected boolean[] canTakeOptions() {

        boolean[] result = new boolean[2];

        print("options...");
        if (m_Clusterer instanceof OptionHandler) {
            println("yes");
            if (m_Debug) {
                println("\n=== Full report ===");
                Enumeration<Option> enu = ((OptionHandler) m_Clusterer).listOptions();
                while (enu.hasMoreElements()) {
                    Option option = enu.nextElement();
                    print(option.synopsis() + "\n" + option.description() + "\n");
                }
                println("\n");
            }
            result[0] = true;
        } else {
            println("no");
            result[0] = false;
        }

        return result;
    }

    /**
     * Checks whether the scheme can build models incrementally.
     * 
     * @return index 0 is true if the clusterer can train incrementally
     */
    protected boolean[] updateableClusterer() {

        boolean[] result = new boolean[2];

        print("updateable clusterer...");
        if (m_Clusterer instanceof UpdateableClusterer) {
            println("yes");
            result[0] = true;
        } else {
            println("no");
            result[0] = false;
        }

        return result;
    }

    /**
     * Checks whether the scheme says it can handle instance weights.
     * 
     * @return true if the clusterer handles instance weights
     */
    protected boolean[] weightedInstancesHandler() {

        boolean[] result = new boolean[2];

        print("weighted instances clusterer...");
        if (m_Clusterer instanceof WeightedInstancesHandler) {
            println("yes");
            result[0] = true;
        } else {
            println("no");
            result[0] = false;
        }

        return result;
    }

    /**
     * Checks whether the scheme handles multi-instance data.
     * 
     * @return true if the clusterer handles multi-instance data
     */
    protected boolean[] multiInstanceHandler() {
        boolean[] result = new boolean[2];

        print("multi-instance clusterer...");
        if (m_Clusterer instanceof MultiInstanceCapabilitiesHandler) {
            println("yes");
            result[0] = true;
        } else {
            println("no");
            result[0] = false;
        }

        return result;
    }

    /**
     * tests for a serialVersionUID. Fails in case the scheme doesn't declare a UID.
     * 
     * @return index 0 is true if the scheme declares a UID
     */
    protected boolean[] declaresSerialVersionUID() {
        boolean[] result = new boolean[2];

        print("serialVersionUID...");

        result[0] = !SerializationHelper.needsUID(m_Clusterer.getClass());

        if (result[0]) {
            println("yes");
        } else {
            println("no");
        }

        return result;
    }

    /**
     * Checks basic prediction of the scheme, for simple non-troublesome datasets.
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @return index 0 is true if the test was passed, index 1 is true if test was
     *         acceptable
     */
    protected boolean[] canPredict(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) {

        print("basic predict");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        Vector<String> accepts = new Vector<String>();
        accepts.addElement("unary");
        accepts.addElement("binary");
        accepts.addElement("nominal");
        accepts.addElement("numeric");
        accepts.addElement("string");
        accepts.addElement("date");
        accepts.addElement("relational");
        accepts.addElement("multi-instance");
        accepts.addElement("not in classpath");
        int numTrain = getNumInstances(), missingLevel = 0;
        boolean predictorMissing = false;

        return runBasicTest(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, missingLevel, predictorMissing, numTrain, accepts);
    }

    /**
     * Checks whether the scheme can handle zero training instances.
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @return index 0 is true if the test was passed, index 1 is true if test was
     *         acceptable
     */
    protected boolean[] canHandleZeroTraining(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) {

        print("handle zero training instances");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        Vector<String> accepts = new Vector<String>();
        accepts.addElement("train");
        accepts.addElement("value");
        int numTrain = 0, missingLevel = 0;
        boolean predictorMissing = false;

        return runBasicTest(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, missingLevel, predictorMissing, numTrain, accepts);
    }

    /**
     * Checks whether the scheme correctly initialises models when buildClusterer is
     * called. This test calls buildClusterer with one training dataset.
     * buildClusterer is then called on a training set with different structure, and
     * then again with the original training set. If the equals method of the
     * ClusterEvaluation class returns false, this is noted as incorrect build
     * initialisation.
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @return index 0 is true if the test was passed
     */
    protected boolean[] correctBuildInitialisation(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) {

        boolean[] result = new boolean[2];

        print("correct initialisation during buildClusterer");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        int numTrain = getNumInstances(), missingLevel = 0;
        boolean predictorMissing = false;

        Instances train1 = null;
        Instances train2 = null;
        Clusterer clusterer = null;
        ClusterEvaluation evaluation1A = null;
        ClusterEvaluation evaluation1B = null;
        ClusterEvaluation evaluation2 = null;
        boolean built = false;
        int stage = 0;
        try {

            // Make two train sets with different numbers of attributes
            train1 = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance);
            train2 = makeTestDataset(84, numTrain, nominalPredictor ? getNumNominal() + 1 : 0, numericPredictor ? getNumNumeric() + 1 : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance);
            if (nominalPredictor && !multiInstance) {
                train1.deleteAttributeAt(0);
                train2.deleteAttributeAt(0);
            }
            if (missingLevel > 0) {
                addMissing(train1, missingLevel, predictorMissing);
                addMissing(train2, missingLevel, predictorMissing);
            }

            clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
            evaluation1A = new ClusterEvaluation();
            evaluation1B = new ClusterEvaluation();
            evaluation2 = new ClusterEvaluation();
        } catch (Exception ex) {
            throw new Error("Error setting up for tests: " + ex.getMessage());
        }
        try {
            stage = 0;
            clusterer.buildClusterer(train1);
            built = true;
            evaluation1A.setClusterer(clusterer);
            evaluation1A.evaluateClusterer(train1);

            stage = 1;
            built = false;
            clusterer.buildClusterer(train2);
            built = true;
            evaluation2.setClusterer(clusterer);
            evaluation2.evaluateClusterer(train2);

            stage = 2;
            built = false;
            clusterer.buildClusterer(train1);
            built = true;
            evaluation1B.setClusterer(clusterer);
            evaluation1B.evaluateClusterer(train1);

            stage = 3;
            if (!evaluation1A.equals(evaluation1B)) {
                if (m_Debug) {
                    println("\n=== Full report ===\n");
                    println("First buildClusterer()");
                    println(evaluation1A.clusterResultsToString() + "\n\n");
                    println("Second buildClusterer()");
                    println(evaluation1B.clusterResultsToString() + "\n\n");
                }
                throw new Exception("Results differ between buildClusterer calls");
            }
            println("yes");
            result[0] = true;

        } catch (Exception ex) {
            println("no");
            result[0] = false;
            if (m_Debug) {
                println("\n=== Full Report ===");
                print("Problem during");
                if (built) {
                    print(" testing");
                } else {
                    print(" training");
                }
                switch (stage) {
                case 0:
                    print(" of dataset 1");
                    break;
                case 1:
                    print(" of dataset 2");
                    break;
                case 2:
                    print(" of dataset 1 (2nd build)");
                    break;
                case 3:
                    print(", comparing results from builds of dataset 1");
                    break;
                }
                println(": " + ex.getMessage() + "\n");
                println("here are the datasets:\n");
                println("=== Train1 Dataset ===\n" + train1.toString() + "\n");
                println("=== Train2 Dataset ===\n" + train2.toString() + "\n");
            }
        }

        return result;
    }

    /**
     * Checks basic missing value handling of the scheme. If the missing values
     * cause an exception to be thrown by the scheme, this will be recorded.
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @param predictorMissing    true if the missing values may be in the
     *                            predictors
     * @param missingLevel        the percentage of missing values
     * @return index 0 is true if the test was passed, index 1 is true if test was
     *         acceptable
     */
    protected boolean[] canHandleMissing(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance, boolean predictorMissing, int missingLevel) {

        if (missingLevel == 100) {
            print("100% ");
        }
        print("missing");
        if (predictorMissing) {
            print(" predictor");
        }
        print(" values");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        Vector<String> accepts = new Vector<String>();
        accepts.addElement("missing");
        accepts.addElement("value");
        accepts.addElement("train");
        int numTrain = getNumInstances();

        return runBasicTest(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance, missingLevel, predictorMissing, numTrain, accepts);
    }

    /**
     * Checks whether the clusterer can handle instance weights. This test compares
     * the clusterer performance on two datasets that are identical except for the
     * training weights. If the results change, then the clusterer must be using the
     * weights. It may be possible to get a false positive from this test if the
     * weight changes aren't significant enough to induce a change in clusterer
     * performance (but the weights are chosen to minimize the likelihood of this).
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @return index 0 true if the test was passed
     */
    protected boolean[] instanceWeights(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) {

        print("clusterer uses instance weights");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        int numTrain = 2 * getNumInstances(), missingLevel = 0;
        boolean predictorMissing = false;

        boolean[] result = new boolean[2];
        Instances train = null;
        Clusterer[] clusterers = null;
        ClusterEvaluation evaluationB = null;
        ClusterEvaluation evaluationI = null;
        boolean built = false;
        boolean evalFail = false;
        try {
            train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() + 1 : 0, numericPredictor ? getNumNumeric() + 1 : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance);
            if (nominalPredictor && !multiInstance) {
                train.deleteAttributeAt(0);
            }
            if (missingLevel > 0) {
                addMissing(train, missingLevel, predictorMissing);
            }
            clusterers = AbstractClusterer.makeCopies(getClusterer(), 2);
            evaluationB = new ClusterEvaluation();
            evaluationI = new ClusterEvaluation();
            clusterers[0].buildClusterer(train);
            evaluationB.setClusterer(clusterers[0]);
            evaluationB.evaluateClusterer(train);
        } catch (Exception ex) {
            throw new Error("Error setting up for tests: " + ex.getMessage());
        }
        try {

            // Now modify instance weights and re-built/test
            for (int i = 0; i < train.numInstances(); i++) {
                train.instance(i).setWeight(0);
            }
            Random random = new Random(1);
            for (int i = 0; i < train.numInstances() / 2; i++) {
                int inst = random.nextInt(train.numInstances());
                int weight = random.nextInt(10) + 1;
                train.instance(inst).setWeight(weight);
            }
            clusterers[1].buildClusterer(train);
            built = true;
            evaluationI.setClusterer(clusterers[1]);
            evaluationI.evaluateClusterer(train);
            if (evaluationB.equals(evaluationI)) {
                // println("no");
                evalFail = true;
                throw new Exception("evalFail");
            }

            println("yes");
            result[0] = true;
        } catch (Exception ex) {
            println("no");
            result[0] = false;

            if (m_Debug) {
                println("\n=== Full Report ===");

                if (evalFail) {
                    println("Results don't differ between non-weighted and " + "weighted instance models.");
                    println("Here are the results:\n");
                    println("\nboth methods\n");
                    println(evaluationB.clusterResultsToString());
                } else {
                    print("Problem during");
                    if (built) {
                        print(" testing");
                    } else {
                        print(" training");
                    }
                    println(": " + ex.getMessage() + "\n");
                }
                println("Here is the dataset:\n");
                println("=== Train Dataset ===\n" + train.toString() + "\n");
                println("=== Train Weights ===\n");
                for (int i = 0; i < train.numInstances(); i++) {
                    println(" " + (i + 1) + "    " + train.instance(i).weight());
                }
            }
        }

        return result;
    }

    /**
     * Checks whether the scheme alters the training dataset during training. If the
     * scheme needs to modify the training data it should take a copy of the
     * training data. Currently checks for changes to header structure, number of
     * instances, order of instances, instance weights.
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @param predictorMissing    true if we know the clusterer can handle (at
     *                            least) moderate missing predictor values
     * @return index 0 is true if the test was passed
     */
    protected boolean[] datasetIntegrity(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance, boolean predictorMissing) {

        print("clusterer doesn't alter original datasets");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        int numTrain = getNumInstances(), missingLevel = 20;

        boolean[] result = new boolean[2];
        Instances train = null;
        Clusterer clusterer = null;
        try {
            train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance);
            if (nominalPredictor && !multiInstance) {
                train.deleteAttributeAt(0);
            }
            if (missingLevel > 0) {
                addMissing(train, missingLevel, predictorMissing);
            }
            clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
        } catch (Exception ex) {
            throw new Error("Error setting up for tests: " + ex.getMessage());
        }
        try {
            Instances trainCopy = new Instances(train);
            clusterer.buildClusterer(trainCopy);
            compareDatasets(train, trainCopy);

            println("yes");
            result[0] = true;
        } catch (Exception ex) {
            println("no");
            result[0] = false;

            if (m_Debug) {
                println("\n=== Full Report ===");
                print("Problem during training");
                println(": " + ex.getMessage() + "\n");
                println("Here is the dataset:\n");
                println("=== Train Dataset ===\n" + train.toString() + "\n");
            }
        }

        return result;
    }

    /**
     * Checks whether an updateable scheme produces the same model when trained
     * incrementally as when batch trained. The model itself cannot be compared, so
     * we compare the evaluation on test data for both models. It is possible to get
     * a false positive on this test (likelihood depends on the classifier).
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @return index 0 is true if the test was passed
     */
    protected boolean[] updatingEquality(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) {

        print("incremental training produces the same results" + " as batch training");
        printAttributeSummary(nominalPredictor, numericPredictor, stringPredictor, datePredictor, relationalPredictor, multiInstance);
        print("...");
        int numTrain = getNumInstances(), missingLevel = 0;
        boolean predictorMissing = false, classMissing = false;

        boolean[] result = new boolean[2];
        Instances train = null;
        Clusterer[] clusterers = null;
        ClusterEvaluation evaluationB = null;
        ClusterEvaluation evaluationI = null;
        boolean built = false;
        try {
            train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance);
            if (missingLevel > 0) {
                addMissing(train, missingLevel, predictorMissing, classMissing);
            }
            clusterers = AbstractClusterer.makeCopies(getClusterer(), 2);
            evaluationB = new ClusterEvaluation();
            evaluationI = new ClusterEvaluation();
            clusterers[0].buildClusterer(train);
            evaluationB.setClusterer(clusterers[0]);
            evaluationB.evaluateClusterer(train);
        } catch (Exception ex) {
            throw new Error("Error setting up for tests: " + ex.getMessage());
        }
        try {
            clusterers[1].buildClusterer(new Instances(train, 0));
            for (int i = 0; i < train.numInstances(); i++) {
                ((UpdateableClusterer) clusterers[1]).updateClusterer(train.instance(i));
            }
            built = true;
            evaluationI.setClusterer(clusterers[1]);
            evaluationI.evaluateClusterer(train);
            if (!evaluationB.equals(evaluationI)) {
                println("no");
                result[0] = false;

                if (m_Debug) {
                    println("\n=== Full Report ===");
                    println("Results differ between batch and " + "incrementally built models.\n" + "Depending on the classifier, this may be OK");
                    println("Here are the results:\n");
                    println("\nbatch built results\n" + evaluationB.clusterResultsToString());
                    println("\nincrementally built results\n" + evaluationI.clusterResultsToString());
                    println("Here are the datasets:\n");
                    println("=== Train Dataset ===\n" + train.toString() + "\n");
                }
            } else {
                println("yes");
                result[0] = true;
            }
        } catch (Exception ex) {
            result[0] = false;

            print("Problem during");
            if (built) {
                print(" testing");
            } else {
                print(" training");
            }
            println(": " + ex.getMessage() + "\n");
        }

        return result;
    }

    /**
     * Runs a text on the datasets with the given characteristics.
     * 
     * @param nominalPredictor    if true use nominal predictor attributes
     * @param numericPredictor    if true use numeric predictor attributes
     * @param stringPredictor     if true use string predictor attributes
     * @param datePredictor       if true use date predictor attributes
     * @param relationalPredictor if true use relational predictor attributes
     * @param multiInstance       whether multi-instance is needed
     * @param missingLevel        the percentage of missing values
     * @param predictorMissing    true if the missing values may be in the
     *                            predictors
     * @param numTrain            the number of instances in the training set
     * @param accepts             the acceptable string in an exception
     * @return index 0 is true if the test was passed, index 1 is true if test was
     *         acceptable
     */
    protected boolean[] runBasicTest(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance, int missingLevel, boolean predictorMissing, int numTrain, Vector<String> accepts) {

        boolean[] result = new boolean[2];
        Instances train = null;
        Clusterer clusterer = null;
        try {
            train = makeTestDataset(42, numTrain, nominalPredictor ? getNumNominal() : 0, numericPredictor ? getNumNumeric() : 0, stringPredictor ? getNumString() : 0, datePredictor ? getNumDate() : 0, relationalPredictor ? getNumRelational() : 0, multiInstance);
            if (nominalPredictor && !multiInstance) {
                train.deleteAttributeAt(0);
            }
            if (missingLevel > 0) {
                addMissing(train, missingLevel, predictorMissing);
            }
            clusterer = AbstractClusterer.makeCopies(getClusterer(), 1)[0];
        } catch (Exception ex) {
            ex.printStackTrace();
            throw new Error("Error setting up for tests: " + ex.getMessage());
        }
        try {
            clusterer.buildClusterer(train);
            println("yes");
            result[0] = true;
        } catch (Exception ex) {
            boolean acceptable = false;
            String msg = ex.getMessage().toLowerCase();
            for (int i = 0; i < accepts.size(); i++) {
                if (msg.indexOf(accepts.elementAt(i)) >= 0) {
                    acceptable = true;
                }
            }

            println("no" + (acceptable ? " (OK error message)" : ""));
            result[1] = acceptable;

            if (m_Debug) {
                println("\n=== Full Report ===");
                print("Problem during training");
                println(": " + ex.getMessage() + "\n");
                if (!acceptable) {
                    if (accepts.size() > 0) {
                        print("Error message doesn't mention ");
                        for (int i = 0; i < accepts.size(); i++) {
                            if (i != 0) {
                                print(" or ");
                            }
                            print('"' + accepts.elementAt(i) + '"');
                        }
                    }
                    println("here is the dataset:\n");
                    println("=== Train Dataset ===\n" + train.toString() + "\n");
                }
            }
        }

        return result;
    }

    /**
     * Add missing values to a dataset.
     * 
     * @param data             the instances to add missing values to
     * @param level            the level of missing values to add (if positive, this
     *                         is the probability that a value will be set to
     *                         missing, if negative all but one value will be set to
     *                         missing (not yet implemented))
     * @param predictorMissing if true, predictor attributes will be modified
     */
    protected void addMissing(Instances data, int level, boolean predictorMissing) {

        Random random = new Random(1);
        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (predictorMissing) {
                    if (random.nextInt(100) < level) {
                        current.setMissing(j);
                    }
                }
            }
        }
    }

    /**
     * Make a simple set of instances with variable position of the class attribute,
     * which can later be modified for use in specific tests.
     * 
     * @param seed          the random number seed
     * @param numInstances  the number of instances to generate
     * @param numNominal    the number of nominal attributes
     * @param numNumeric    the number of numeric attributes
     * @param numString     the number of string attributes
     * @param numDate       the number of date attributes
     * @param numRelational the number of relational attributes
     * @param multiInstance whether the dataset should a multi-instance dataset
     * @return the test dataset
     * @throws Exception if the dataset couldn't be generated
     * @see TestInstances#CLASS_IS_LAST
     */
    protected Instances makeTestDataset(int seed, int numInstances, int numNominal, int numNumeric, int numString, int numDate, int numRelational, boolean multiInstance) throws Exception {

        TestInstances dataset = new TestInstances();

        dataset.setSeed(seed);
        dataset.setNumInstances(numInstances);
        dataset.setNumNominal(numNominal);
        dataset.setNumNumeric(numNumeric);
        dataset.setNumString(numString);
        dataset.setNumDate(numDate);
        dataset.setNumRelational(numRelational);
        dataset.setClassIndex(TestInstances.NO_CLASS);
        dataset.setMultiInstance(multiInstance);

        return dataset.generate();
    }

    /**
     * Print out a short summary string for the dataset characteristics
     * 
     * @param nominalPredictor    true if nominal predictor attributes are present
     * @param numericPredictor    true if numeric predictor attributes are present
     * @param stringPredictor     true if string predictor attributes are present
     * @param datePredictor       true if date predictor attributes are present
     * @param relationalPredictor true if relational predictor attributes are
     *                            present
     * @param multiInstance       whether multi-instance is needed
     */
    protected void printAttributeSummary(boolean nominalPredictor, boolean numericPredictor, boolean stringPredictor, boolean datePredictor, boolean relationalPredictor, boolean multiInstance) {

        String str = "";

        if (numericPredictor) {
            str += "numeric";
        }

        if (nominalPredictor) {
            if (str.length() > 0) {
                str += " & ";
            }
            str += "nominal";
        }

        if (stringPredictor) {
            if (str.length() > 0) {
                str += " & ";
            }
            str += "string";
        }

        if (datePredictor) {
            if (str.length() > 0) {
                str += " & ";
            }
            str += "date";
        }

        if (relationalPredictor) {
            if (str.length() > 0) {
                str += " & ";
            }
            str += "relational";
        }

        str = " (" + str + " predictors)";

        print(str);
    }

    /**
     * Test method for this class
     * 
     * @param args the commandline options
     */
    public static void main(String[] args) {
        runCheck(new CheckClusterer(), args);
    }
}
